Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 19 additions & 91 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1625,41 +1625,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
}
}

/// Translate a spill size in bytes into the SI_SPILL_A<N>_SAVE pseudo opcode
/// whose bit width N equals Size * 8. Sizes with no matching entry are a
/// programmer error and hit llvm_unreachable.
static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
  struct SizeOpcodePair {
    unsigned Bytes;
    unsigned Opcode;
  };

  // One entry per supported spill width, in ascending size order.
  static const SizeOpcodePair SaveOpcodes[] = {
      {4, AMDGPU::SI_SPILL_A32_SAVE},
      {8, AMDGPU::SI_SPILL_A64_SAVE},
      {12, AMDGPU::SI_SPILL_A96_SAVE},
      {16, AMDGPU::SI_SPILL_A128_SAVE},
      {20, AMDGPU::SI_SPILL_A160_SAVE},
      {24, AMDGPU::SI_SPILL_A192_SAVE},
      {28, AMDGPU::SI_SPILL_A224_SAVE},
      {32, AMDGPU::SI_SPILL_A256_SAVE},
      {36, AMDGPU::SI_SPILL_A288_SAVE},
      {40, AMDGPU::SI_SPILL_A320_SAVE},
      {44, AMDGPU::SI_SPILL_A352_SAVE},
      {48, AMDGPU::SI_SPILL_A384_SAVE},
      {64, AMDGPU::SI_SPILL_A512_SAVE},
      {128, AMDGPU::SI_SPILL_A1024_SAVE},
  };

  for (const SizeOpcodePair &Entry : SaveOpcodes)
    if (Entry.Bytes == Size)
      return Entry.Opcode;

  llvm_unreachable("unknown register size");
}

static unsigned getAVSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
Expand Down Expand Up @@ -1707,22 +1672,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
return AMDGPU::SI_SPILL_WWM_V32_SAVE;
}

static unsigned getVectorRegSpillSaveOpcode(Register Reg,
const TargetRegisterClass *RC,
unsigned Size,
const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &MFI) {
bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
Register Reg, const TargetRegisterClass *RC, unsigned Size,
const SIMachineFunctionInfo &MFI) const {
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);

// Choose the right opcode if spilling a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);

if (IsVectorSuperClass)
// TODO: Check if AGPRs are available
if (ST.hasMAIInsts())
return getAVSpillSaveOpcode(Size);

return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
: getVGPRSpillSaveOpcode(Size);
return getVGPRSpillSaveOpcode(Size);
}

void SIInstrInfo::storeRegToStackSlot(
Expand Down Expand Up @@ -1770,8 +1733,8 @@ void SIInstrInfo::storeRegToStackSlot(
return;
}

unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
SpillSize, RI, *MFI);
unsigned Opcode =
getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
MFI->setHasSpilledVGPRs();

BuildMI(MBB, MI, DL, get(Opcode))
Expand Down Expand Up @@ -1854,41 +1817,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
}
}

/// Translate a spill size in bytes into the SI_SPILL_A<N>_RESTORE pseudo
/// opcode whose bit width N equals Size * 8. Sizes with no matching entry are
/// a programmer error and hit llvm_unreachable.
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  struct SizeOpcodePair {
    unsigned Bytes;
    unsigned Opcode;
  };

  // One entry per supported reload width, in ascending size order.
  static const SizeOpcodePair RestoreOpcodes[] = {
      {4, AMDGPU::SI_SPILL_A32_RESTORE},
      {8, AMDGPU::SI_SPILL_A64_RESTORE},
      {12, AMDGPU::SI_SPILL_A96_RESTORE},
      {16, AMDGPU::SI_SPILL_A128_RESTORE},
      {20, AMDGPU::SI_SPILL_A160_RESTORE},
      {24, AMDGPU::SI_SPILL_A192_RESTORE},
      {28, AMDGPU::SI_SPILL_A224_RESTORE},
      {32, AMDGPU::SI_SPILL_A256_RESTORE},
      {36, AMDGPU::SI_SPILL_A288_RESTORE},
      {40, AMDGPU::SI_SPILL_A320_RESTORE},
      {44, AMDGPU::SI_SPILL_A352_RESTORE},
      {48, AMDGPU::SI_SPILL_A384_RESTORE},
      {64, AMDGPU::SI_SPILL_A512_RESTORE},
      {128, AMDGPU::SI_SPILL_A1024_RESTORE},
  };

  for (const SizeOpcodePair &Entry : RestoreOpcodes)
    if (Entry.Bytes == Size)
      return Entry.Opcode;

  llvm_unreachable("unknown register size");
}

static unsigned getAVSpillRestoreOpcode(unsigned Size) {
switch (Size) {
case 4:
Expand Down Expand Up @@ -1930,27 +1858,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
if (Size != 4)
llvm_unreachable("unknown wwm register spill size");

if (IsVectorSuperClass)
if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be enabled. Did you run into any problems spilling AV_WWM register classes now?
It can also be done in a separate patch.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't tried; it's hard enough to test the base case. WWM can be left for later.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok.

return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;

return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
}

static unsigned
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
unsigned Size, const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &MFI) {
bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
Register Reg, const TargetRegisterClass *RC, unsigned Size,
const SIMachineFunctionInfo &MFI) const {
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);

// Choose the right opcode if restoring a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);

if (IsVectorSuperClass)
// TODO: Check if AGPRs are available
if (ST.hasMAIInsts())
return getAVSpillRestoreOpcode(Size);

return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
: getVGPRSpillRestoreOpcode(Size);
assert(!RI.isAGPRClass(RC));
return getVGPRSpillRestoreOpcode(Size);
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Expand Down Expand Up @@ -1998,7 +1926,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
}

unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
SpillSize, RI, *MFI);
SpillSize, *MFI);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class LiveVariables;
class MachineDominatorTree;
class MachineRegisterInfo;
class RegScavenger;
class SIMachineFunctionInfo;
class TargetRegisterClass;
class ScheduleHazardRecognizer;

Expand Down Expand Up @@ -287,6 +288,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
int64_t &ImmVal) const override;

/// Select the spill-save opcode for vector register \p Reg of register class
/// \p RC; storeRegToStackSlot passes the result to BuildMI.
/// \p Size is the spill size (bytes, judging by the size-keyed opcode helpers
/// in SIInstrInfo.cpp, where 4 maps to the 32-bit opcodes).
/// \p MFI is consulted for the register's WWM_REG flag, which selects the
/// dedicated WWM spill opcodes.
unsigned getVectorRegSpillSaveOpcode(Register Reg,
const TargetRegisterClass *RC,
unsigned Size,
const SIMachineFunctionInfo &MFI) const;
/// Select the spill-restore opcode for vector register \p Reg of register
/// class \p RC; loadRegFromStackSlot passes the result to BuildMI.
/// \p Size is the spill size (bytes, judging by the size-keyed opcode helpers
/// in SIInstrInfo.cpp, where 4 maps to the 32-bit opcodes).
/// \p MFI is consulted for the register's WWM_REG flag, which selects the
/// dedicated WWM restore opcodes.
unsigned
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
unsigned Size,
const SIMachineFunctionInfo &MFI) const;

void storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex, const TargetRegisterClass *RC,
Expand Down
105 changes: 31 additions & 74 deletions llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -444,14 +444,6 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
Expand All @@ -464,20 +456,15 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:240
; GISEL-GFX942-NEXT: s_nop 0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:32
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
Expand All @@ -490,10 +477,8 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
Expand Down Expand Up @@ -822,14 +807,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
Expand All @@ -842,20 +819,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:240
; SDAG-GFX942-NEXT: s_nop 0
; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2)
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:32
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
Expand All @@ -868,10 +841,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
Expand Down Expand Up @@ -993,16 +964,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
Expand All @@ -1015,20 +976,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240
; GISEL-GFX942-NEXT: s_nop 0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
Expand All @@ -1041,10 +1000,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
Expand Down
Loading
Loading